Code
# Mount Google Drive so the CSV files under MyDrive can be read from Colab.
from google.colab import drive
drive.mount('/content/drive')
import polars as pl
# Notebook output of the mount call (it had been fused onto the import line):
# Mounted at /content/drive
Reasons for Leaving School Early - Quarto Report
# Mount Google Drive; every file path below is relative to /content.
from google.colab import drive
drive.mount('/content/drive')
import polars as pl
# Captured cell output, previously glued to the end of the import statement:
# Mounted at /content/drive
# Preview the UNICEF metadata CSV. Up to 10000 rows are scanned for schema
# inference, and 'Population, total' is forced to Float64 instead of being
# left to inference.
pl.read_csv(
    'drive/MyDrive/Colab Notebooks/unicef_metadata1.csv',
    schema_overrides={'Population, total': pl.Float64},
    infer_schema_length=10000,
)
# Country-level metadata, loaded with the same schema settings as the
# preview: 10000-row inference window, 'Population, total' as Float64.
meta = pl.read_csv(
    'drive/MyDrive/Colab Notebooks/unicef_metadata1.csv',
    infer_schema_length=10000,
    schema_overrides={'Population, total': pl.Float64},
)

# Preview the out-of-school-rate CSV (separate notebook cell; the result is
# only displayed, not stored).
pl.read_csv('drive/MyDrive/Colab Notebooks/Out-of-school rate for adolescents of lower secondary school age (administrative data).csv')

# Out-of-school-rate table used by every chart below.
OOSR = pl.read_csv('drive/MyDrive/Colab Notebooks/Out-of-school rate for adolescents of lower secondary school age (administrative data).csv')

# First line of the next cell; it had been fused onto the OOSR assignment.
import polars as pl
import geopandas as gpd
import plotly.express as px
# Load the Excel file
df = OOSR
# Filter where sex == 'Total'
df_total = df.filter(pl.col("sex") == "Total")
# Group by country and compute average dropout rate
avg_dropout = df_total.group_by("country").agg(pl.col("obs_value").mean().alias("avg_dropout_rate"))
# Convert to pandas for Plotly
avg_df = avg_dropout.to_pandas()
# Create a choropleth map
fig = px.choropleth(
avg_df,
locations="country",
locationmode="country names",
color="avg_dropout_rate",
color_continuous_scale="Reds",
title="Average Dropout Rate per Country (Adolescents)"
)
fig.show()!pip install --upgrade polars
!pip install country_converter
!pip install plotnine geopandas
from plotnine import *
import country_converter as coco
import polars as pl
# Median observed value per country and sex (rows with sex == "Total" are
# excluded), reshaped so that each sex value becomes its own column.
gender_data = (
    OOSR
    .filter(pl.col("sex") != "Total")
    .group_by(["country", "sex"])
    .agg(pl.col("obs_value").median())
    # `columns=` is deprecated in polars 1.x and has been renamed to `on=`.
    .pivot(on="sex", index="country", values="obs_value")
)
# Look up a continent for every country in the pivoted table and attach it
# as a new "continent" column.
cc = coco.CountryConverter()
countries = gender_data["country"].to_list()
# NOTE(review): with not_found=None, country_converter is documented to pass
# unmatched names through unchanged, so an unmatched country would show up
# as its own "continent" value. Confirm that this is intended.
continent_list = cc.convert(names=countries, to='continent', not_found=None)
gender_data_with_continent = gender_data.with_columns(
    pl.Series("continent", continent_list)
)
# Reshape back to long form: one row per (country, continent, sex).
# `DataFrame.melt` is deprecated in polars 1.x; `unpivot` is its replacement,
# with `index=` taking the place of `id_vars=`. Every column that is not
# listed in `index` is unpivoted.
melted_data = gender_data_with_continent.unpivot(
    index=["country", "continent"],
    variable_name="sex",
    value_name="obs_value",
)

# Median of the per-country values within each continent and sex; this is
# the table the bar chart is drawn from.
continent_medians = (
    melted_data
    .group_by(["continent", "sex"])
    .agg(pl.col("obs_value").median().alias("median_obs_value"))
)
# Dodged bar chart of the continent-level medians, one bar per sex.
(
    ggplot(continent_medians, aes(x="continent", y="median_obs_value", fill="sex"))
    + geom_col(position="dodge")
    + labs(x="Continent", y="Median Out of School Rate", fill="Sex")
    # A complete theme replaces whatever theme was set before it, so
    # theme_minimal() has to come first for the axis-label rotation to
    # survive.
    + theme_minimal()
    + theme(axis_text_x=element_text(rotation=45))
    + scale_fill_manual(values=["purple", "pink"])
)
# First line of the captured pip output (it had been fused onto the closing
# parenthesis above):
# Requirement already satisfied: polars in /usr/local/lib/python3.11/dist-packages (1.27.1)
Requirement already satisfied: country_converter in /usr/local/lib/python3.11/dist-packages (1.3)
Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.11/dist-packages (from country_converter) (2.2.2)
Requirement already satisfied: numpy>=1.23.2 in /usr/local/lib/python3.11/dist-packages (from pandas>=1.0->country_converter) (2.0.2)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas>=1.0->country_converter) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas>=1.0->country_converter) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas>=1.0->country_converter) (2025.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas>=1.0->country_converter) (1.17.0)
Requirement already satisfied: plotnine in /usr/local/lib/python3.11/dist-packages (0.14.5)
Requirement already satisfied: geopandas in /usr/local/lib/python3.11/dist-packages (1.0.1)
Requirement already satisfied: matplotlib>=3.8.0 in /usr/local/lib/python3.11/dist-packages (from plotnine) (3.10.0)
Requirement already satisfied: pandas>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from plotnine) (2.2.2)
Requirement already satisfied: mizani~=0.13.0 in /usr/local/lib/python3.11/dist-packages (from plotnine) (0.13.3)
Requirement already satisfied: numpy>=1.23.5 in /usr/local/lib/python3.11/dist-packages (from plotnine) (2.0.2)
Requirement already satisfied: scipy>=1.8.0 in /usr/local/lib/python3.11/dist-packages (from plotnine) (1.14.1)
Requirement already satisfied: statsmodels>=0.14.0 in /usr/local/lib/python3.11/dist-packages (from plotnine) (0.14.4)
Requirement already satisfied: pyogrio>=0.7.2 in /usr/local/lib/python3.11/dist-packages (from geopandas) (0.10.0)
Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from geopandas) (24.2)
Requirement already satisfied: pyproj>=3.3.0 in /usr/local/lib/python3.11/dist-packages (from geopandas) (3.7.1)
Requirement already satisfied: shapely>=2.0.0 in /usr/local/lib/python3.11/dist-packages (from geopandas) (2.1.0)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.8.0->plotnine) (1.3.2)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.8.0->plotnine) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.8.0->plotnine) (4.57.0)
Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.8.0->plotnine) (1.4.8)
Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.8.0->plotnine) (11.1.0)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.8.0->plotnine) (3.2.3)
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.8.0->plotnine) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas>=2.2.0->plotnine) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas>=2.2.0->plotnine) (2025.2)
Requirement already satisfied: certifi in /usr/local/lib/python3.11/dist-packages (from pyogrio>=0.7.2->geopandas) (2025.1.31)
Requirement already satisfied: patsy>=0.5.6 in /usr/local/lib/python3.11/dist-packages (from statsmodels>=0.14.0->plotnine) (1.0.1)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.7->matplotlib>=3.8.0->plotnine) (1.17.0)
DeprecationWarning: The argument `columns` for `DataFrame.pivot` is deprecated. It has been renamed to `on`.
<ipython-input-10-033859602bac>:23: DeprecationWarning: `DataFrame.melt` is deprecated. Use `unpivot` instead, with `index` instead of `id_vars` and `on` instead of `value_vars`
# Per-country mean GDP per capita versus per-country mean out-of-school rate.
#
# Each table is aggregated to one row per country BEFORE joining. Joining the
# raw tables on "country" pairs every metadata row with every OOSR row of
# that country, which multiplies the row count without changing the means.
# Only sex == "Total" rows feed the rate, matching the other charts; the
# Female/Male rows would otherwise be averaged in alongside the totals.
oosr_by_country = (
    OOSR
    .filter(pl.col("sex") == "Total")
    .group_by("country")
    .agg(pl.col("obs_value").mean().alias("avg_obs_value"))
)
gdp_by_country = (
    meta
    .group_by("country")
    .agg(pl.col("GDP per capita (constant 2015 US$)").mean().alias("GDP_per_capita"))
)
# Inner join: only countries present in both tables are kept.
gdp_dropout = gdp_by_country.join(oosr_by_country, on="country")

(
    ggplot(gdp_dropout, aes(x="avg_obs_value", y="GDP_per_capita"))
    + geom_point()
    + labs(x="Out of School Rate", y="GDP per capita (constant 2015 US$)")
    + theme_seaborn()
)
# Warning captured from the original notebook run (it had been fused onto the
# closing parenthesis above):
# /usr/local/lib/python3.11/dist-packages/plotnine/layer.py:364: PlotnineWarning: geom_point : Removed 2 rows containing missing values.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Reuse the polars frames loaded earlier and convert them to pandas.
metadata = meta
dropout = OOSR
metadata_pd = metadata.to_pandas()
dropout_pd = dropout.to_pandas()

# Keep only the rows whose sex column is "Total".
dropout_total = dropout_pd[dropout_pd["sex"] == "Total"]

# Highest observed value per country.
dropout_max = dropout_total.groupby("country")["obs_value"].max().reset_index()
dropout_max.columns = ["country", "max_dropout"]

# Median life expectancy per country.
life_expectancy = metadata_pd.groupby("country")["Life expectancy at birth, total (years)"].median().reset_index()
life_expectancy.columns = ["country", "median_life_expectancy"]

# Inner merge: countries present in both tables.
merged = pd.merge(dropout_max, life_expectancy, on="country", how="inner")

# Order the rows by life expectancy (highest first) so the heatmap's row
# order carries that second variable.
merged_sorted = merged.sort_values("median_life_expectancy", ascending=False)

# Single-column heatmap: one row per country, coloured by max_dropout.
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=merged_sorted.set_index("country")[["max_dropout"]],
    cmap="YlOrRd",
    annot=False,
    fmt=".1f",
    linewidths=0.5,
    cbar_kws={'label': 'Max Dropout Rate (%)'},
    xticklabels=False
)
plt.title("Heatmap of Max Dropout Rate vs Median Life Expectancy by Country")
plt.xlabel("Max Dropout Rate")
plt.ylabel("Country (sorted by life expectancy)")
plt.tight_layout(pad=0.5)
plt.show()
# Report text that followed this cell (it had been fused onto plt.show()):
# Please click this link to view the Military graph
import pandas as pd
import plotly.express as px
# Bind the polars frames to the names the conversions below read from.
# (Assigning them to metadata_pd / dropout_pd instead would make the two
# to_pandas() calls depend on `metadata` / `dropout` left over from an
# earlier cell.)
metadata = meta
dropout = OOSR
metadata_pd = metadata.to_pandas()
dropout_pd = dropout.to_pandas()

# Keep only the rows whose sex column is "Total".
dropout_total = dropout_pd[dropout_pd["sex"] == "Total"]

# Median observed value per country.
dropout_median = dropout_total.groupby("country")["obs_value"].median().reset_index()
dropout_median.columns = ["country", "median_dropout"]

# One row per country: max military expenditure, median life expectancy,
# max GDP per capita.
metadata_grouped = metadata_pd.groupby("country").agg({
    "Military expenditure (% of GDP)": "max",
    "Life expectancy at birth, total (years)": "median",
    "GDP per capita (constant 2015 US$)": "max"
}).reset_index()

# Inner merge, then shorten the column names for plotting.
merged = pd.merge(metadata_grouped, dropout_median, on="country", how="inner")
merged = merged.rename(columns={
    "Military expenditure (% of GDP)": "military_expenditure",
    "Life expectancy at birth, total (years)": "life_expectancy",
    "GDP per capita (constant 2015 US$)": "gdp_per_capita"
})

# Define income level
def income_level(gdp):
    """Classify a GDP-per-capita value into an income band.

    Args:
        gdp: GDP per capita as a number. May be None or NaN when the value
            is missing.

    Returns:
        "High Income" for values >= 45000, "Middle Income" for values
        >= 20000, "Low Income" for anything lower, and "Unknown" when the
        value is missing.
    """
    # NaN is the only value that differs from itself. Every ordering
    # comparison against NaN is False, so without this guard a missing GDP
    # would silently fall through to "Low Income".
    if gdp is None or gdp != gdp:
        return "Unknown"
    if gdp >= 45000:
        return "High Income"
    if gdp >= 20000:
        return "Middle Income"
    return "Low Income"
# Tag every country with its income band.
merged["income_level"] = merged["gdp_per_capita"].apply(income_level)

# Drop countries that lack any of the three plotted measures.
merged = merged.dropna(subset=["military_expenditure", "median_dropout", "life_expectancy"])

# Sunburst: inner ring = income band, outer ring = country. Wedge size is
# military expenditure, colour is the median dropout value, and life
# expectancy is shown on hover.
fig = px.sunburst(
    merged,
    path=["income_level", "country"],
    values="military_expenditure",
    color="median_dropout",
    color_continuous_scale="YlOrRd",
    hover_data=["life_expectancy"],
    labels={
        "military_expenditure": "Military Expenditure (% of GDP)",
        "median_dropout": "Dropout Rate (%)",
        "life_expectancy": "Life Expectancy (Years)",
        "income_level": "Income Level"
    },
    title="Military Expenditure Breakdown by Income Group<br>(Dropout Rate = Color, Life Expectancy on Hover)"
)
fig.update_traces(insidetextorientation='radial')
fig.show()
# Report note that followed this cell (it had been fused onto fig.show());
# the bounds are written to match the thresholds used by income_level:
# Income (USD) is divided by: High >= 45,000, Middle >= 20,000, Low < 20,000
Please click this link to view the Life Expectancy vs Birth Rate graph
import plotly.express as px
import pandas as pd
# Bind the polars frames loaded earlier and convert both to pandas.
metadata, dropout = meta, OOSR
metadata_pd = metadata.to_pandas()
dropout_pd = dropout.to_pandas()

# Rows whose sex column is "Total".
dropout_total = dropout_pd.loc[dropout_pd["sex"] == "Total"]

# Median observed value per country, as a two-column frame.
dropout_median = (
    dropout_total
    .groupby("country", as_index=False)["obs_value"]
    .median()
    .rename(columns={"obs_value": "median_dropout"})
)

# Per-country maximum of birth rate, life expectancy and GDP per capita.
metadata_grouped = (
    metadata_pd
    .groupby("country")
    .agg(dict.fromkeys(
        [
            "Birth rate, crude (per 1,000 people)",
            "Life expectancy at birth, total (years)",
            "GDP per capita (constant 2015 US$)",
        ],
        "max",
    ))
    .reset_index()
)

# Inner merge on country, then switch to short column names.
merged = (
    metadata_grouped
    .merge(dropout_median, on="country", how="inner")
    .rename(columns={
        "Birth rate, crude (per 1,000 people)": "birth_rate",
        "Life expectancy at birth, total (years)": "life_expectancy",
        "GDP per capita (constant 2015 US$)": "gdp_per_capita",
    })
)

# Create income level calculated field
def income_level(gdp):
    """Map a GDP-per-capita figure to an income band label.

    Args:
        gdp: GDP per capita as a number. None or NaN marks a missing value.

    Returns:
        "High Income" when gdp >= 45000, "Middle Income" when gdp >= 20000,
        "Low Income" below that, and "Unknown" when gdp is missing.
    """
    # `gdp != gdp` is True only for NaN. Ordering comparisons against NaN
    # are always False, so an unguarded NaN would be labelled "Low Income".
    if gdp is None or gdp != gdp:
        return "Unknown"
    if gdp >= 45000:
        return "High Income"
    if gdp >= 20000:
        return "Middle Income"
    return "Low Income"
# Tag every country with its income band.
merged["income_level"] = merged["gdp_per_capita"].apply(income_level)

# Single colour table shared by the helper column and the chart. "Low Income"
# uses blue, the colour the chart actually renders.
color_map = {
    "High Income": "green",
    "Middle Income": "orange",
    "Low Income": "blue"
}
merged["color"] = merged["income_level"].map(color_map)

# Bubble chart: birth rate against life expectancy, bubble size from the
# median dropout value, colour from income band, with an OLS trendline.
fig = px.scatter(
    merged,
    x="birth_rate",
    y="life_expectancy",
    size="median_dropout",
    color="income_level",
    color_discrete_map=color_map,
    hover_name="country",  # Show country name on hover
    hover_data={  # Show additional data on hover
        "birth_rate": True,
        "life_expectancy": True,
        "median_dropout": True,
        "income_level": True
    },
    title="Max Birth Rate vs. Life Expectancy by Country (Interactive)<br>(Trendline: Linear Regression)",
    labels={
        "birth_rate": "Max Birth Rate (per 1,000 people)",
        "life_expectancy": "Max Life Expectancy (years)",
        "median_dropout": "Median Dropout Rate",
        "income_level": "Income Level"
    },
    trendline="ols"
)
fig.show()